library(limma)
library(Glimma)
library(tidyverse)
library(DESeq2)
library(org.Hs.eg.db)
library(pheatmap)
library(EnhancedVolcano)
library(ComplexHeatmap)
library(gplots)
library(msigdbr)
library(clusterProfiler)
library(enrichplot)
library(DOSE)
library(gridExtra)
library(grid)
There were two sequencing runs conducted on the same samples (same library pool re-sequenced)
Get log fold change volcanoes for both sequencing runs
Decide if these sequencing runs should be combined
set.seed(42)
# These counts were generated on the HPC using RSubRead FeatureCounts package
load("source_data/FeatCounts_Rsub_run35.RData")
counts_run35 <- counts$counts
load("source_data/FeatCounts_Rsub_run36.RData")
counts_run36 <- counts$counts
head(counts_run35[1:4,1:4])
## MERNA1.bam MERNA10.bam MERNA11.bam MERNA12.bam
## 100287102 10 15 12 21
## 653635 498 271 417 423
## 102466751 4 7 8 9
## 100302278 1 0 0 0
counts <- counts_run35
coldata <- tibble::tribble(
~SeqID, ~healthy_donor, ~condition,
"MERNA1", "D1", "IL2",
"MERNA2", "D1", "IL15",
"MERNA3", "D1", "IL21",
"MERNA4", "D1", "IL7",
"MERNA5", "D1", "UST",
"MERNA6", "D2", "IL2",
"MERNA7", "D2", "IL15",
"MERNA8", "D2", "IL21",
"MERNA9", "D2", "IL7",
"MERNA10", "D2", "UST",
"MERNA11", "D3", "IL2",
"MERNA12", "D3", "IL15",
"MERNA13", "D3", "IL21",
"MERNA14", "D3", "IL7",
"MERNA15", "D3", "UST"
)
gene.list <- data.frame(entrez.id = rownames(counts))
gene.list <- gene.list %>%
dplyr::mutate(
SYMBOL = mapIds(org.Hs.eg.db, keys = gene.list$entrez.id, column = "SYMBOL", keytype = "ENTREZID"),
GENETYPE = mapIds(org.Hs.eg.db, keys = gene.list$entrez.id, column = "GENETYPE", keytype = "ENTREZID"),
ENSEMBL = mapIds(org.Hs.eg.db, keys = gene.list$entrez.id, column = "ENSEMBL", keytype = "ENTREZID")
)
# Rename to SYMBOL
filtered.gene.list <- gene.list %>%
dplyr::filter(
!is.na(SYMBOL),
!duplicated(SYMBOL))
# Rename gene IDs from ensembl to symbol
counts <- counts[rownames(counts) %in% filtered.gene.list$entrez.id, ]
rownames(counts) <- filtered.gene.list$SYMBOL[match(rownames(counts), filtered.gene.list$entrez.id)]
counts[1:4,1:4]
## MERNA1.bam MERNA10.bam MERNA11.bam MERNA12.bam
## DDX11L1 10 15 12 21
## WASH7P 498 271 417 423
## MIR6859-1 4 7 8 9
## MIR1302-2 1 0 0 0
# Remove the ".bam" extension from colnames of counts
colnames(counts) <- sub(".bam", "", colnames(counts))
# Reorder columns of counts to match the order in coldata$SeqID
counts_run35 <- counts[, match(coldata$SeqID, colnames(counts))]
saveRDS(counts_run35, "counts_run35.rds")
dds <- DESeqDataSetFromMatrix(counts_run35, coldata, design = ~healthy_donor + condition)
dds <- estimateSizeFactors(dds)
idx <- rowSums(counts(dds, normalized=TRUE) >= 10 ) >= 3
dds <- dds[idx,]
dds <- DESeq(dds)
vst <- varianceStabilizingTransformation(dds)
vsn::meanSdPlot(assay(vst))
plotPCA(vst, intgroup = c("healthy_donor"))
plotPCA(vst, intgroup = c("condition"))
sizeFactors(dds)
## MERNA1 MERNA2 MERNA3 MERNA4 MERNA5 MERNA6 MERNA7 MERNA8
## 1.8058775 1.4788977 1.2524641 0.8612628 0.7966731 0.7554115 1.4088596 0.8897406
## MERNA9 MERNA10 MERNA11 MERNA12 MERNA13 MERNA14 MERNA15
## 0.9966245 0.6587678 0.9831123 0.6706531 1.3473446 0.9526336 0.9431699
normalized_counts <- counts(dds, normalized=TRUE)
res.il15 <- results(dds, contrast = c("condition","IL21","IL15"))
res.sig.il15 <- subset(res.il15, padj < 0.05)
lfc.il15 <- res.sig.il15[ order(res.sig.il15$log2FoldChange, decreasing=TRUE), ]
summary(lfc.il15)
##
## out of 600 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 122, 20%
## LFC < 0 (down) : 478, 80%
## outliers [1] : 0, 0%
## low counts [2] : 0, 0%
## (mean count < 3)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
lfc.il15
## log2 fold change (MLE): condition IL21 vs IL15
## Wald test p-value: condition IL21 vs IL15
## DataFrame with 600 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## CXCL13 86.02403 11.66732 2.55013 4.57518 4.75800e-06 7.75444e-04
## COL6A3 401.77920 7.12769 1.29528 5.50281 3.73786e-08 3.63717e-05
## RDH8 57.46700 7.04912 1.84572 3.81917 1.33902e-04 7.09086e-03
## GREM1 21.04118 6.90119 1.88518 3.66076 2.51472e-04 1.12739e-02
## NCAN 6.16191 6.34385 1.80852 3.50776 4.51896e-04 1.77666e-02
## ... ... ... ... ... ... ...
## RGS8 25.0830 -7.08206 1.58708 -4.46231 8.10807e-06 0.001048143
## OLAH 52.6620 -7.13134 1.38420 -5.15197 2.57758e-07 0.000121453
## SOCS2-AS1 23.8220 -7.18129 1.64787 -4.35792 1.31304e-05 0.001481357
## ENPEP 84.1598 -8.93951 1.78546 -5.00685 5.53280e-07 0.000205096
## HMCN1 117.7320 -9.42187 1.68347 -5.59669 2.18482e-08 0.000027624
res.il2 <- results(dds,contrast = c("condition","IL21","IL2"))
res.sig.il2 <- subset(res.il2, padj < 0.05)
lfc.il2 <- res.sig.il2[ order(res.sig.il2$log2FoldChange, decreasing=TRUE), ]
summary(lfc.il2)
##
## out of 1779 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 609, 34%
## LFC < 0 (down) : 1170, 66%
## outliers [1] : 0, 0%
## low counts [2] : 0, 0%
## (mean count < 3)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
lfc.il2
## log2 fold change (MLE): condition IL21 vs IL2
## Wald test p-value: condition IL21 vs IL2
## DataFrame with 1779 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## CXCL13 86.02403 10.05458 2.46592 4.07742 4.55374e-05 1.35042e-03
## IGKV1-9 31.92469 7.22644 1.27779 5.65541 1.55474e-08 1.62022e-06
## IGHV3-74 6.05461 6.73644 1.80041 3.74162 1.82836e-04 4.17046e-03
## IGHV3-48 6.46942 6.73370 1.95990 3.43573 5.90966e-04 1.03495e-02
## COL6A3 401.77920 6.65152 1.29346 5.14242 2.71224e-07 1.91072e-05
## ... ... ... ... ... ... ...
## RGS8 25.0830 -7.82936 1.58029 -4.95439 7.25569e-07 4.31160e-05
## OLAH 52.6620 -7.99767 1.37946 -5.79768 6.72371e-09 8.11483e-07
## DNAJC22 50.1076 -9.07540 1.52727 -5.94224 2.81154e-09 3.91458e-07
## ENPEP 84.1598 -9.88347 1.78332 -5.54216 2.98756e-08 2.76282e-06
## HMCN1 117.7320 -10.32902 1.68161 -6.14235 8.13102e-10 1.34672e-07
lfc.il15.df <- lfc.il15 %>% as.data.frame() %>% rownames_to_column(.,var = 'GENE_ID')
lfc.il2.df <- lfc.il2 %>% as.data.frame() %>% rownames_to_column(.,var = 'GENE_ID')
# Set fold change and p-value cutoffs
fold_cutoff <- 2.5
pvalue_cutoff <- 0.05
label_fold_cutoff <- 7
label_pvalue_cutoff <- 0.05
# Filter data based on cutoffs
res.il2.labels <- subset(res.il2, abs(log2FoldChange) > label_fold_cutoff & -log10(padj) > -log10(label_pvalue_cutoff))
res.il15.labels <- subset(res.il15, abs(log2FoldChange) > label_fold_cutoff & -log10(padj) > -log10(label_pvalue_cutoff))
res.il2.labels <- unique(paste0(c(rownames(res.il2.labels), "CXCL13", "IGFBP4", "TNFRSF8", "OSM", "IFNG", "DUSP6", "LTA", "SOCS2", "CSF2", "FCRL2")))
res.il15.labels <- unique(paste0(c(rownames(res.il15.labels), "CXCL13", "IGFBP4", "TNFRSF8", "OSM", "IFNG", "DUSP6", "LTA", "SOCS2", "CSF2", "FCRL2")))
filtered_data_intersect <- intersect(res.il2.labels, res.il15.labels)
# Create the first EnhancedVolcano plot for IL21 versus IL2
plot1_run35 <- EnhancedVolcano(res.il2,
lab = rownames(res.il2),
selectLab = filtered_data_intersect,
drawConnectors = TRUE,
x = 'log2FoldChange',
y = 'padj',
title = 'IL21 versus IL2',
pCutoff = pvalue_cutoff,
FCcutoff = fold_cutoff,
pointSize = 3.0,
boxedLabels = TRUE,
labSize = 6.0)
# Create the second EnhancedVolcano plot for IL21 versus IL15
plot2_run35 <- EnhancedVolcano(res.il15,
lab = rownames(res.il15),
selectLab = filtered_data_intersect,
drawConnectors = TRUE,
x = 'log2FoldChange',
y = 'padj',
title = 'IL21 versus IL15',
pCutoff = pvalue_cutoff,
FCcutoff = fold_cutoff,
pointSize = 3.0,
boxedLabels = TRUE,
labSize = 6.0)
# Arrange the two plots side by side using gridExtra
grid.arrange(plot1_run35, plot2_run35, ncol = 2)
counts <- counts_run36
coldata <- tibble::tribble(
~SeqID, ~healthy_donor, ~condition,
"MERNA1", "D1", "IL2",
"MERNA2", "D1", "IL15",
"MERNA3", "D1", "IL21",
"MERNA4", "D1", "IL7",
"MERNA5", "D1", "UST",
"MERNA6", "D2", "IL2",
"MERNA7", "D2", "IL15",
"MERNA8", "D2", "IL21",
"MERNA9", "D2", "IL7",
"MERNA10", "D2", "UST",
"MERNA11", "D3", "IL2",
"MERNA12", "D3", "IL15",
"MERNA13", "D3", "IL21",
"MERNA14", "D3", "IL7",
"MERNA15", "D3", "UST"
)
gene.list <- data.frame(entrez.id = rownames(counts))
gene.list <- gene.list %>%
dplyr::mutate(
SYMBOL = mapIds(org.Hs.eg.db, keys = gene.list$entrez.id, column = "SYMBOL", keytype = "ENTREZID"),
GENETYPE = mapIds(org.Hs.eg.db, keys = gene.list$entrez.id, column = "GENETYPE", keytype = "ENTREZID"),
ENSEMBL = mapIds(org.Hs.eg.db, keys = gene.list$entrez.id, column = "ENSEMBL", keytype = "ENTREZID")
)
# Rename to SYMBOL
filtered.gene.list <- gene.list %>%
dplyr::filter(
!is.na(SYMBOL),
!duplicated(SYMBOL))
# Rename gene IDs from ensembl to symbol
counts <- counts[rownames(counts) %in% filtered.gene.list$entrez.id, ]
rownames(counts) <- filtered.gene.list$SYMBOL[match(rownames(counts), filtered.gene.list$entrez.id)]
counts[1:4,1:4]
## MERNA1.bam MERNA10.bam MERNA11.bam MERNA12.bam
## DDX11L1 2 20 15 37
## WASH7P 85 487 406 980
## MIR6859-1 0 10 7 33
## MIR1302-2 0 0 0 0
# Remove the ".bam" extension from colnames of counts
colnames(counts) <- sub(".bam", "", colnames(counts))
# Reorder columns of counts to match the order in coldata$SeqID
counts_run36 <- counts[, match(coldata$SeqID, colnames(counts))]
saveRDS(counts_run36, "counts_run36.rds")
dds <- DESeqDataSetFromMatrix(counts_run36, coldata, design = ~healthy_donor + condition)
dds <- estimateSizeFactors(dds)
idx <- rowSums(counts(dds, normalized=TRUE) >= 10 ) >= 3
dds <- dds[idx,]
dds <- DESeq(dds)
vst <- varianceStabilizingTransformation(dds)
vsn::meanSdPlot(assay(vst))
plotPCA(vst, intgroup = c("healthy_donor"))
plotPCA(vst, intgroup = c("condition"))
sizeFactors(dds)
## MERNA1 MERNA2 MERNA3 MERNA4 MERNA5 MERNA6 MERNA7 MERNA8
## 0.3633083 0.2570585 1.1260936 1.7044318 1.5821346 1.4238734 0.6168305 1.5043697
## MERNA9 MERNA10 MERNA11 MERNA12 MERNA13 MERNA14 MERNA15
## 1.2135284 1.3913264 1.1655990 1.7515148 0.6391892 1.0577737 1.3042779
normalized_counts <- counts(dds, normalized=TRUE)
res.il15 <- results(dds, contrast = c("condition","IL21","IL15"))
res.sig.il15 <- subset(res.il15, padj < 0.05)
lfc.il15 <- res.sig.il15[ order(res.sig.il15$log2FoldChange, decreasing=TRUE), ]
summary(lfc.il15)
##
## out of 514 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 88, 17%
## LFC < 0 (down) : 426, 83%
## outliers [1] : 0, 0%
## low counts [2] : 0, 0%
## (mean count < 31)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
lfc.il15
## log2 fold change (MLE): condition IL21 vs IL15
## Wald test p-value: condition IL21 vs IL15
## DataFrame with 514 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## CXCL13 70.6592 8.50599 2.022667 4.20533 2.60697e-05 2.01833e-03
## COL6A3 342.2097 6.69035 1.283910 5.21092 1.87910e-07 9.21659e-05
## TNFRSF8 370.2915 4.51090 1.334518 3.38017 7.24409e-04 2.34372e-02
## BCAT1 1154.2222 4.36151 0.970938 4.49206 7.05370e-06 1.00123e-03
## JCHAIN 113.2250 4.09748 0.872142 4.69818 2.62495e-06 5.23175e-04
## ... ... ... ... ... ... ...
## IL13 52.1665 -5.83101 1.42019 -4.10580 4.02916e-05 0.002685085
## DNAJC22 42.5352 -6.09647 1.51633 -4.02056 5.80610e-05 0.003613931
## OLAH 41.8109 -7.55617 1.53253 -4.93051 8.20136e-07 0.000245281
## HMCN1 96.9493 -7.86271 1.54524 -5.08834 3.61212e-07 0.000138412
## ENPEP 65.9906 -7.91695 1.90355 -4.15905 3.19569e-05 0.002346438
res.il2 <- results(dds,contrast = c("condition","IL21","IL2"))
res.sig.il2 <- subset(res.il2, padj < 0.05)
lfc.il2 <- res.sig.il2[ order(res.sig.il2$log2FoldChange, decreasing=TRUE), ]
summary(lfc.il2)
##
## out of 1648 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 567, 34%
## LFC < 0 (down) : 1081, 66%
## outliers [1] : 0, 0%
## low counts [2] : 0, 0%
## (mean count < 3)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
lfc.il2
## log2 fold change (MLE): condition IL21 vs IL2
## Wald test p-value: condition IL21 vs IL2
## DataFrame with 1648 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## CXCL13 70.65920 9.54598 2.07407 4.60254 4.17371e-06 1.94743e-04
## COL6A3 342.20974 6.76980 1.27357 5.31559 1.06310e-07 8.58810e-06
## IGKV2-24 3.55377 6.09640 1.97368 3.08886 2.00928e-03 2.59230e-02
## IGHV3-74 5.27194 5.92522 1.92669 3.07534 2.10266e-03 2.67645e-02
## FCRL5 27.30764 5.61344 1.29536 4.33349 1.46763e-05 5.61754e-04
## ... ... ... ... ... ... ...
## HSPA4L 24.7930 -7.90183 1.41828 -5.57142 2.52666e-08 2.44796e-06
## OLAH 41.8109 -8.40829 1.52690 -5.50676 3.65508e-08 3.41089e-06
## DNAJC22 42.5352 -8.69111 1.48548 -5.85071 4.89475e-09 6.15322e-07
## HMCN1 96.9493 -8.78465 1.54264 -5.69454 1.23704e-08 1.36352e-06
## ENPEP 65.9906 -9.47420 1.89768 -4.99251 5.95995e-07 3.78337e-05
lfc.il15.df <- lfc.il15 %>% as.data.frame() %>% rownames_to_column(.,var = 'GENE_ID')
lfc.il2.df <- lfc.il2 %>% as.data.frame() %>% rownames_to_column(.,var = 'GENE_ID')
# Set fold change and p-value cutoffs
fold_cutoff <- 2.5
pvalue_cutoff <- 0.05
label_fold_cutoff <- 7
label_pvalue_cutoff <- 0.05
# Filter data based on cutoffs
res.il2.labels <- subset(res.il2, abs(log2FoldChange) > label_fold_cutoff & -log10(padj) > -log10(label_pvalue_cutoff))
res.il15.labels <- subset(res.il15, abs(log2FoldChange) > label_fold_cutoff & -log10(padj) > -log10(label_pvalue_cutoff))
res.il2.labels <- unique(paste0(c(rownames(res.il2.labels), "CXCL13", "IGFBP4", "TNFRSF8", "OSM", "IFNG", "DUSP6", "LTA", "SOCS2", "CSF2", "FCRL2")))
res.il15.labels <- unique(paste0(c(rownames(res.il15.labels), "CXCL13", "IGFBP4", "TNFRSF8", "OSM", "IFNG", "DUSP6", "LTA", "SOCS2", "CSF2", "FCRL2")))
filtered_data_intersect <- intersect(res.il2.labels, res.il15.labels)
# Create the first EnhancedVolcano plot for IL21 versus IL2
plot1_run36 <- EnhancedVolcano(res.il2,
lab = rownames(res.il2),
selectLab = filtered_data_intersect,
drawConnectors = TRUE,
x = 'log2FoldChange',
y = 'padj',
title = 'IL21 versus IL2',
pCutoff = pvalue_cutoff,
FCcutoff = fold_cutoff,
pointSize = 3.0,
boxedLabels = TRUE,
labSize = 6.0)
# Create the second EnhancedVolcano plot for IL21 versus IL15
plot2_run36 <- EnhancedVolcano(res.il15,
lab = rownames(res.il15),
selectLab = filtered_data_intersect,
drawConnectors = TRUE,
x = 'log2FoldChange',
y = 'padj',
title = 'IL21 versus IL15',
pCutoff = pvalue_cutoff,
FCcutoff = fold_cutoff,
pointSize = 3.0,
boxedLabels = TRUE,
labSize = 6.0)
# Arrange the two plots side by side using gridExtra
grid.arrange(plot1_run36, plot2_run36, ncol = 2)
# Create titles for the different runs
title_run35 <- textGrob("Run 35", gp = gpar(fontsize = 40, fontface = "bold"))
title_run36 <- textGrob("Run 36", gp = gpar(fontsize = 40, fontface = "bold"))
# Arrange the plots in a grid layout
grid.arrange(
title_run35, nullGrob(), plot1_run35, plot2_run35,
title_run36, nullGrob(), plot1_run36, plot2_run36,
ncol = 2,
layout_matrix = rbind(c(1, 2),
c(3, 4),
c(5, 6),
c(7, 8)),
heights = c(0.1, 1, 0.1, 1)
)
The following plots show the estimated sequencing saturation for the two sequencing runs conducted for this project. The code and detailed outs are found in this directory. The code is not run in this directory because the sampling is computationally taxing; so I have ran that independently of the main markdown and just showing the final plots here.
The sequencing runs are called 35 and 36 because this is the numbers given by the sequencing hub, and have just used this code as shorthand for describing the separate sequencing runs. Go to the HPC analysis script for the long sequencing name.
The plots for run 35 and run 36 are very similar. They both reach random sampling sequencing saturation. The technical batch effect will be greater than any extra information by combining the sequencing runs. Therefore, the final analysis will just contain sequencing run 35.
The final analysis for the 2024 paper is the same as is shown in the ‘run 35’ chunks - with IL-7 removed as this was not necessary for the paper.